{
 "cells": [
  {
   "cell_type": "markdown",
   "source": [
    "# Preprocess Example\n",
    "本Notebook展示简易的数据预处理方法"
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "source": [
    "# Standard library\n",
    "import os\n",
    "import sys\n",
    "import warnings\n",
    "\n",
    "# Plotting: render matplotlib figures inline in the notebook\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# Data handling\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "# Project-local datasets: put the parent directory on sys.path so `dataset` can be imported\n",
    "datalib_path = os.path.join(os.path.abspath('.'), '../')\n",
    "sys.path.append(datalib_path)\n",
    "import dataset\n",
    "\n",
    "# Silence all warnings from this point on\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "# scikit-learn preprocessing utilities\n",
    "from sklearn import preprocessing\n",
    "# import seaborn"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "source": [
    "# Load the Titanic train/test CSVs from the directory given by dataset.titanic_path\n",
    "train, test = (\n",
    "    pd.read_csv(os.path.join(dataset.titanic_path, csv_name))\n",
    "    for csv_name in ('train.csv', 'test.csv')\n",
    ")\n",
    "\n",
    "# Preview the first rows of the training split\n",
    "train.head()"
   ],
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>male</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>A/5 21171</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>PC 17599</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>C85</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>female</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>STON/O2. 3101282</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>113803</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>C123</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>male</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>373450</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   PassengerId  Survived  Pclass  \\\n",
       "0            1         0       3   \n",
       "1            2         1       1   \n",
       "2            3         1       3   \n",
       "3            4         1       1   \n",
       "4            5         0       3   \n",
       "\n",
       "                                                Name     Sex   Age  SibSp  \\\n",
       "0                            Braund, Mr. Owen Harris    male  22.0      1   \n",
       "1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   \n",
       "2                             Heikkinen, Miss. Laina  female  26.0      0   \n",
       "3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   \n",
       "4                           Allen, Mr. William Henry    male  35.0      0   \n",
       "\n",
       "   Parch            Ticket     Fare Cabin Embarked  \n",
       "0      0         A/5 21171   7.2500   NaN        S  \n",
       "1      0          PC 17599  71.2833   C85        C  \n",
       "2      0  STON/O2. 3101282   7.9250   NaN        S  \n",
       "3      0            113803  53.1000  C123        S  \n",
       "4      0            373450   8.0500   NaN        S  "
      ]
     },
     "metadata": {},
     "execution_count": 8
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "source": [
    "# Inspect the distributions of two numeric features\n",
    "train.hist(column=['Age', 'SibSp'], figsize=(10, 4))"
   ],
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f91141f9198>,\n",
       "        <matplotlib.axes._subplots.AxesSubplot object at 0x7f911416b6d8>]],\n",
       "      dtype=object)"
      ]
     },
     "metadata": {},
     "execution_count": 9
    },
    {
     "output_type": "display_data",
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAlYAAAEICAYAAACdyboFAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAfsklEQVR4nO3df5QlZX3n8fdnGX8gGgfEdAiDGVxRY5yIZqJkNW4r0QB6BDeGQIgMSjLJBrOazG4cTc5qYjyLm4yIP0IyBnTIIsIRybDIurBox7i7EEFZB0HjSIYw48CoINhgTBq/+8etCZe2h77dXbfvj3m/zrnnVj31VPX3mWoevl1P1VOpKiRJkrR0/2rQAUiSJI0LEytJkqSWmFhJkiS1xMRKkiSpJSZWkiRJLTGxkiRJaomJlSRJPUpyWpKru9YrydMGGZOGi4mVWpFkKsk9SR4z6FgkaamSvCjJ/0lyb5K7k/zvJD9dVRdV1ct7PMajk2xKsjPJdJIdSd7T79g1WCZWWrIkq4GfBQp41UCDkaQlSvJDwJXA+4BDgMOBPwC+t8BDvQVYCzwfeAIwCXy+tUA1lEys1IbTgeuADwPr9hYmeVKS/57kviSfS/JHST7btf2ZSa5p/hr8SpKTlz90SfoBTweoqour6sGq+m5VXV1VX0xyRnc/1jghyW1Jvpnkj5Ps/X/rTwOXV9XXq2NHVV24d6fmCtZbktzSXPH/UJLHLlMb1ScmVmrD6cBFzefnk0w05R8A7gd+hE7C1Z10HQRcA3wE+GHgFOBPkzxrGeOWpLn8HfBgki1Jjk9y8Dz1X03nytTzgBOB1zfl1wG/k+Q3k6xJkjn2PQ34eeBf00nofr+VFmhgTKy0JEleBPwYcGlV3Qh8DfjlJAcAvwC8raoeqKpbgC1du74S2FFVH6qqmar6AnAZ8IvL3ARJepiqug94EZ3bGz4IfCPJFV1/NM72rqq6u6r+AXgPcGpT/l+Ad9FJnm4AdiVZN2vf91fVHVV1N/DOrn01okystFTrgKur6pvN+keasicDK4A7uup2L/8Y8IIk3977odP5/MgyxCxJj6iqbq2qM6pqFfBs4EfpJE1z6e7bbm/q0gwjfqCqXgispJM4XZDkx+fbV6PLxEqLluRA4GTg3ya5M8mdwG8DzwEmgBlgVdcuR3Qt3wH8dVWt7Po8vqr+/XLFL0m9qKov07mH9Nn7qNLdtz0F+Pocx/huVX0AuAfovuVh3n01WkystBQnAQ/S6SSObj4/DvwNnfuuPg68PcnjkjyzKdvrSuDpSV6b5FHN56dn/SUnScuuebBmQ5JVzfoRdIbortvHLv8pycFNvTcClzT7vSnJZJIDk6xohgGfAHyha9+zkqxKcgjwe3v31egysdJSrAM+VFX/UFV37v0A76czrPcG4InAncBfAhfTPK5cVd8BXk7npvWvN3XeBTgPlqRB+w7wAuD6JPfTSahuBjbso/5W4EbgJuATwPlN+QPAJjr92zeBs4BfqKrbuvb9CHA1cBude1T/qNWWaNmlqgYdg/YTSd4F/EhVzb55U5L2O0l2AL9aVf9r0LGoPV6xUt80l9N/Mh3PB84ELh90XJIk9cuKQQegsfYEOsN/PwrcReeS+NaBRiRJUh85FChJktQShwIlSZJaMhRDgYceemitXr163nr3338/Bx10UP8DGpBxbp9tG10Lad+NN974zap6cp9DGmm99ncw3r9btm002baOR+rrhiKxWr16NTfccMO89aamppicnOx/QAMyzu2zbaNrIe1Lcnt/oxl9vfZ3MN6/W7ZtNNm2jkfq6xwKlCRJaomJlSRJUktMrCRJklpiYiVJktQSEytJkqSWmFhJkiS1xMRKkiSpJSZWkiRJLZk3sUpyQZI9SW7uKrskyU3NZ0eSm5ry1Um+27Xtz/oZvCS1LcnKJB9L8uUktyb5mSSHJLkmyVeb74Obukny
3iTbk3wxyfMGHb+kwepl5vUPA+8HLtxbUFW/tHc5ySbg3q76X6uqo9sKcNSt3viJnutuWDPDGQuov+PsVywmJEmP7Fzgk1X1miSPBh4HvBW4tqrOTrIR2Ai8GTgeOKr5vAA4r/luxbZd9y6oT+iVfYfUP/NesaqqzwB3z7UtSYCTgYtbjkuSll2SJwIvBs4HqKp/qqpvAycCW5pqW4CTmuUTgQur4zpgZZLDljlsSUMkVTV/pWQ1cGVVPXtW+YuBd1fV2q56XwL+DrgP+P2q+pt9HHM9sB5gYmLipz760Y/OG8f09DSPf/zj5603TLbtunf+So2JA+Gu7/Z+7DWHP3EREQ3GKJ67Xo1z22Bh7XvJS15y497+YBQlORrYDNwCPAe4EXgjsKuqVjZ1AtxTVSuTXAmcXVWfbbZdC7y5qm6YddwF93cAe+6+d0F9Qq+Goe8Y5/9ubNtoaquvW+pLmE/l4VerdgNPqapvJfkp4K+S/ERV3Td7x6raTKcDY+3atdXLiw9H8eWPC7mMv2HNDJu29X5Kdpw2uYiIBmMUz12vxrltMP7tm2UF8Dzgt6rq+iTn0hn2+xdVVUnm/4v04fssuL8DeN9FWxfUJ/RqGPqOcf69sm2jqa22LfqpwCQrgH8HXLK3rKq+V1XfapZvBL4GPH2pQUrSMtkJ7Kyq65v1j9FJtO7aO8TXfO9ptu8Cjujaf1VTJmk/tZTpFn4O+HJV7dxbkOTJSQ5olp9K54bO25YWoiQtj6q6E7gjyTOaomPpDAteAaxrytYBW5vlK4DTm6cDjwHurardyxmzpOEy7zXmJBcDk8ChSXYCb6uq84FT+MGb1l8M/GGSfwa+D/xGVc1547skDanfAi5qngi8DXgdnT9CL01yJnA7nYd2AK4CTgC2Aw80dSXtx+ZNrKrq1H2UnzFH2WXAZUsPS5IGo6puAua6KfXYOeoWcFbfg5I0Mpx5XZIkqSXtP26iZbOQyUcXwskDJUlaHK9YSZIktcTESpIkqSUmVpIkSS0xsZIkSWqJiZUkSVJLTKwkSZJaYmIlSZLUEhMrSZKklphYSZIktcTESpIkqSUmVpIkSS0xsZIkSWqJiZUkSVJLTKwkSZJaYmIlSZLUEhMrSZKklphYSZIktcTESpIkqSXzJlZJLkiyJ8nNXWVvT7IryU3N54SubW9Jsj3JV5L8fL8ClyRJGja9XLH6MHDcHOXnVNXRzecqgCTPAk4BfqLZ50+THNBWsJIkScNs3sSqqj4D3N3j8U4EPlpV36uqvwe2A89fQnySJEkjY8US9n1DktOBG4ANVXUPcDhwXVednU3ZD0iyHlgPMDExwdTU1Lw/cHp6uqd6w2TDmpme604cuLD6/dKPf+NRPHe9Gue2wfi3T5LatNjE6jzgHUA135uA1y/kAFW1GdgMsHbt2pqcnJx3n6mpKXqpN0zO2PiJnutuWDPDpm1LyXXbseO0ydaPOYrnrlfj3DYY//ZJUpsW9VRgVd1VVQ9W1feBD/LQcN8u4IiuqquaMkkaCUl2JNnWPJhzQ1N2SJJrkny1+T64KU+S9zYP7HwxyfMGG72kQVtUYpXksK7VVwN7nxi8AjglyWOSHAkcBfzt0kKUpGX3kubBnLXN+kbg2qo6Cri2WQc4nk4/dxSdWxvOW/ZIJQ2VecedklwMTAKHJtkJvA2YTHI0naHAHcCvA1TVl5JcCtwCzABnVdWD/QldkpbNiXT6QYAtwBTw5qb8wqoq4LokK5McVlW7BxKlpIGbN7GqqlPnKD7/Eeq/E3jnUoKSpAEq4OokBfx5cz/oRFeydCcw0SwfDtzRte/eB3Yellgt5mEd6N8DLcPwMMI4PxRh20ZTW20b/J3SkjRcXlRVu5L8MHBNki93b6yqapKuni3mYR2A9120tS8PtPTjAZWFGueHImzbaGqrbb7SRpK6VNWu5nsPcDmdh3Pu2ntvafO9p6nuAzuSHsbESpIaSQ5K8oS9y8DL6TyccwWwrqm2
DtjaLF8BnN48HXgMcK/3V0n7N4cCJekhE8DlSaDTP36kqj6Z5HPApUnOBG4HTm7qXwWcQOctEw8Ar1v+kCUNExMrSWpU1W3Ac+Yo/xZw7BzlBZy1DKFJGhEOBUqSJLXExEqSJKklJlaSJEktMbGSJElqiYmVJElSS0ysJEmSWmJiJUmS1BITK0mSpJaYWEmSJLXExEqSJKklJlaSJEktMbGSJElqiYmVJElSS0ysJEmSWmJiJUmS1JJ5E6skFyTZk+TmrrI/TvLlJF9McnmSlU356iTfTXJT8/mzfgYvSZI0THq5YvVh4LhZZdcAz66qnwT+DnhL17avVdXRzec32glTkiRp+M2bWFXVZ4C7Z5VdXVUzzep1wKo+xCZJkjRS2rjH6vXA/+haPzLJF5L8dZKfbeH4kiRJI2HFUnZO8nvADHBRU7QbeEpVfSvJTwF/leQnquq+OfZdD6wHmJiYYGpqat6fNz093VO9YbJhzcz8lRoTBy6sfr/04994FM9dr8a5bTD+7ZOkNi06sUpyBvBK4NiqKoCq+h7wvWb5xiRfA54O3DB7/6raDGwGWLt2bU1OTs77M6empuil3jA5Y+Mneq67Yc0Mm7YtKddtxY7TJls/5iieu16Nc9tg/NsnSW1a1FBgkuOA3wVeVVUPdJU/OckBzfJTgaOA29oIVJIkadjNe3kkycXAJHBokp3A2+g8BfgY4JokANc1TwC+GPjDJP8MfB/4jaq6e84DS5IkjZl5E6uqOnWO4vP3Ufcy4LKlBiVJkjSKnHldkiSpJSZWktQlyQHNlDFXNutHJrk+yfYklyR5dFP+mGZ9e7N99SDjljQcTKwk6eHeCNzatf4u4JyqehpwD3BmU34mcE9Tfk5TT9J+bvDP9mvorF7AFBG92rBmhsnWjyq1K8kq4BXAO4HfSefpnJcCv9xU2QK8HTgPOLFZBvgY8P4k2Tv9jKT9k1esJOkh76Ezlcz3m/UnAd/ueoXXTuDwZvlw4A6AZvu9TX1J+zGvWEkSkOSVwJ5mcuPJlo+94DdNQP/exjAMM+mP84z+tm00tdU2EytJ6ngh8KokJwCPBX4IOBdYmWRFc1VqFbCrqb8LOALYmWQF8ETgW3MdeDFvmgB430Vb+/I2hn68XWGhxnlGf9s2mtpqm0OBkgRU1VuqalVVrQZOAT5VVacBnwZe01RbB2xtlq9o1mm2f8r7qySZWEnSI3sznRvZt9O5h2rvBMnnA09qyn8H2Dig+CQNEYcCJWmWqpoCpprl24Dnz1HnH4FfXNbAJA09r1hJkiS1xMRKkiSpJSZWkiRJLTGxkiRJaomJlSRJUktMrCRJklpiYiVJktQSEytJkqSWmFhJkiS1xMRKkiSpJSZWkiRJLekpsUpyQZI9SW7uKjskyTVJvtp8H9yUJ8l7k2xP8sUkz+tX8JIkScOk1ytWHwaOm1W2Ebi2qo4CruWhN7sfDxzVfNYD5y09TEmSpOHXU2JVVZ8B7p5VfCKwpVneApzUVX5hdVwHrExyWBvBSpIkDbMVS9h3oqp2N8t3AhPN8uHAHV31djZlu7vKSLKezhUtJiYmmJqamvcHTk9P91RvmGxYM9Nz3YkDF1Z/lEwcyMidu16N4u/lQox7+ySpTUtJrP5FVVWSWuA+m4HNAGvXrq3Jycl595mamqKXesPkjI2f6LnuhjUzbNrWyikZOhvWzHDyiJ27Xo3i7+VCjHv7JKlNS3kq8K69Q3zN956mfBdwRFe9VU2ZJEnSWFtKYnUFsK5ZXgds7So/vXk68Bjg3q4hQ0mSpLHV07hTkouBSeDQJDuBtwFnA5cmORO4HTi5qX4VcAKwHXgAeF3LMUuSJA2lnhKrqjp1H5uOnaNuAWctJShJkqRR5MzrkiRJLTGxkiRJaomJlSRJUktMrCRJklpiYiVJktQSEytJaiR5bJK/TfL/knwpyR805UcmuT7J9iSXJHl0U/6YZn17s331IOOXNHgmVpL0kO8BL62q5wBH
A8c1Ex2/Czinqp4G3AOc2dQ/E7inKT+nqSdpP2ZiJUmN6phuVh/VfAp4KfCxpnwLcFKzfGKzTrP92CRZpnAlDSETK0nqkuSAJDfRef/pNcDXgG9X1UxTZSdweLN8OHAHQLP9XuBJyxuxpGHS08zrkrS/qKoHgaOTrAQuB5651GMmWQ+sB5iYmGBqaqqn/SYOhA1rZuavuEC9/vx+mp6eHoo4+sG2jaa22mZiJUlzqKpvJ/k08DPAyiQrmqtSq4BdTbVdwBHAziQrgCcC35rjWJuBzQBr166tycnJnmJ430Vb2bSt/W56x2m9/fx+mpqaotd/h1Fj20ZTW21zKFCSGkme3FypIsmBwMuAW4FPA69pqq0DtjbLVzTrNNs/1bwvVdJ+yitWkvSQw4AtSQ6g84fnpVV1ZZJbgI8m+SPgC8D5Tf3zgb9Msh24GzhlEEFLGh4mVpLUqKovAs+do/w24PlzlP8j8IvLEJqkEeFQoCRJUktMrCRJklpiYiVJktQSEytJkqSWmFhJkiS1xMRKkiSpJYuebiHJM4BLuoqeCvxnYCXwa8A3mvK3VtVVi45QkiRpRCw6saqqrwBHQ+elpXRe7XA58DrgnKr6k1YilCRJGhFtDQUeC3ytqm5v6XiSJEkjp62Z108BLu5af0OS04EbgA1Vdc/sHRbztvdRfKv2Qt5M36832Q+DiQMZuXPXq1H8vVyIcW+fJLVpyYlVkkcDrwLe0hSdB7wDqOZ7E/D62fst5m3vo/hW7TM2fqLnuhvWzPTlTfbDYMOaGU4esXPXq1H8vVyIcW+fJLWpjaHA44HPV9VdAFV1V1U9WFXfBz7IHO/XkiRJGkdtJFan0jUMmOSwrm2vBm5u4WdIkiQNvSWNOyU5CHgZ8Otdxf81ydF0hgJ3zNomSZI0tpaUWFXV/cCTZpW9dkkRSZIkjajxvFNaQ2n1Am7kX4gdZ7+iL8eVJGmhfKWNJElSS0ysJEmSWmJiJUmS1BITK0mSpJaYWEmSJLXExEqSJKklJlaSJEktMbGSJElqiYmVJElSS0ysJEmSWuIrbSSpkeQI4EJggs6L5DdX1blJDgEuAVbTebn8yVV1T5IA5wInAA8AZ1TV5wcR+0L4eimpf7xiJUkPmQE2VNWzgGOAs5I8C9gIXFtVRwHXNusAxwNHNZ/1wHnLH7KkYWJiJUmNqtq994pTVX0HuBU4HDgR2NJU2wKc1CyfCFxYHdcBK5MctsxhSxoiDgVK0hySrAaeC1wPTFTV7mbTnXSGCqGTdN3RtdvOpmx3VxlJ1tO5osXExARTU1M9xTBxIGxYM7Oo+Aeh13YBTE9PL6j+KLFto6mttplYSdIsSR4PXAa8qaru69xK1VFVlaQWcryq2gxsBli7dm1NTk72tN/7LtrKpm2j003vOG2y57pTU1P0+u8wamzbaGqrbQ4FSlKXJI+ik1RdVFUfb4rv2jvE13zvacp3AUd07b6qKZO0nzKxkqRG85Tf+cCtVfXurk1XAOua5XXA1q7y09NxDHBv15ChpP3Q6FxjlqT+eyHwWmBbkpuasrcCZwOXJjkTuB04udl2FZ2pFrbTmW7hdcsbrqRhM3KJlfOvSOqXqvoskH1sPnaO+gWc1degJI2UJSdWSXYA3wEeBGaqau2+JtNb6s+SJEkaZm3dY/WSqjq6qtY26/uaTE+SJGls9evm9X1NpidJkjS22rjHqoCrm3ld/ryZr2Vfk+n9i8VMmDc9Pc2GNQ+2EPIP6teEZwuZ3G/UJgNciH62bdCT1Y3zhHkw/u2TpDa1kVi9qKp2Jflh4JokX+7euK/J9BYzYd7U1BSbPnt/CyH/oIVMbLcQZyzgZvsNa2ZGajLAhehn2/p17no1zhPmwfi3T5LatOShwKra1XzvAS4Hns++J9OTJEkaW0tKrJIclOQJe5eBlwM3s+/J9CRJksbWUsdmJoDLm/dorQA+UlWfTPI55p5MT5IkaWwtKbGqqtuA58xR/i3mmExvmPVr
4lFJkrT/GM87pbVfcTZ+SdKw8CXMkiRJLTGxkiRJaomJlSRJUktMrCRJklpiYiVJktQSEytJkqSWmFhJkiS1xMRKkiSpJSZWkiRJLTGxkiRJaomJlSRJUktMrCRJklpiYiVJktQSEytJkqSWmFhJkiS1xMRKkhpJLkiyJ8nNXWWHJLkmyVeb74Ob8iR5b5LtSb6Y5HmDi1zSsDCxkqSHfBg4blbZRuDaqjoKuLZZBzgeOKr5rAfOW6YYJQ0xEytJalTVZ4C7ZxWfCGxplrcAJ3WVX1gd1wErkxy2PJFKGlYrBh2AJA25iara3SzfCUw0y4cDd3TV29mU7WaWJOvpXNViYmKCqamp3n7wgbBhzczioh6AXtsFMD09vaD6o8S2jaa22rboxCrJEcCFdDqZAjZX1blJ3g78GvCNpupbq+qqpQYqSYNWVZWkFrHfZmAzwNq1a2tycrKn/d530VY2bRudv393nDbZc92pqSl6/XcYNbZtNLXVtqX8FzsDbKiqzyd5AnBjkmuabedU1Z8sOTpJGry7khxWVbubob49Tfku4IiuequaMkn7sUXfY1VVu6vq883yd4Bb6VwGl6RxcgWwrlleB2ztKj+9eTrwGODeriFDSfupVm5eT7IaeC5wfVP0hubx4wv2PposScMuycXA/wWekWRnkjOBs4GXJfkq8HPNOsBVwG3AduCDwG8OIGRJQ2bJg/dJHg9cBrypqu5Lch7wDjr3Xb0D2AS8fo79Fnwz5/T0NBvWPLjUkIfWqN2ouhCj2LZeb2Ic55s5Yfzb162qTt3HpmPnqFvAWf2NSNKoWVJileRRdJKqi6rq4wBVdVfX9g8CV86172Ju5pyammLTZ+9fSshDbcOamZG6UXUhRrJt23r7Xduw5sEF/V7uOPsVi41oIMb5ZlVJatuihwKTBDgfuLWq3t1V3j2Py6uBm2fvK0mSNI6WcgnhhcBrgW1JbmrK3gqcmuRoOkOBO4BfX1KEkiRJI2LRiVVVfRbIHJucs0qSJO2XfKWNJElSS0ysJEmSWmJiJUmS1BITK0mSpJaYWEmSJLXExEqSJKklJlaSJEktMbGSJElqiYmVJElSS0ysJEmSWmJiJUmS1BITK0mSpJaYWEmSJLVkxaADkPY3qzd+oi/H3XH2K/pyXElS70ysJEmtWMgfDRvWzHBGj/X9o0GjxKFASZKklphYSZIktcTESpIkqSXeYyVJGmqj9sDHtl339nz/2EJ4r9lo8IqVJElSS0ysJEmSWtK3ocAkxwHnAgcAf1FVZ/frZ0nq33DJh487qC/HHRf2ddLyGva+ri+JVZIDgA8ALwN2Ap9LckVV3dKPnydJg2Bfp+U0avea7a/6NRT4fGB7Vd1WVf8EfBQ4sU8/S5IGxb5O0sOkqto/aPIa4Liq+tVm/bXAC6rqDV111gPrm9VnAF/p4dCHAt9sOdxhMs7ts22jayHt+7GqenI/gxkmvfR1Tfli+jsY798t2zaabFvHPvu6gU23UFWbgc0L2SfJDVW1tk8hDdw4t8+2ja5xb99yWEx/B+P9b2/bRpNtm1+/hgJ3AUd0ra9qyiRpnNjXSXqYfiVWnwOOSnJkkkcDpwBX9OlnSdKg2NdJepi+DAVW1UySNwD/k84jyBdU1ZdaOPSCL6WPmHFun20bXePevkXrY1+31zj/29u20WTb5tGXm9clSZL2R868LkmS1BITK0mSpJaMTGKV5LgkX0myPcnGQcezFEmOSPLpJLck+VKSNzblhyS5JslXm++DBx3rYiU5IMkXklzZrB+Z5Prm/F3S3Og7kpKsTPKxJF9OcmuSnxmXc5fkt5vfyZuTXJzkseN07kbFOPV33fbV942T2X3fOJmr7xt0TG2Zq+9b7LFGIrHqem3E8cCzgFOTPGuwUS3JDLChqp4FHAOc1bRnI3BtVR0FXNusj6o3Ard2rb8LOKeqngbcA5w5kKjacS7wyap6JvAcOu0c+XOX5HDgPwBrq+rZdG7GPoXx
OndDbwz7u2776vvGyey+b5zM1feNvEfo+xZlJBIrxuy1EVW1u6o+3yx/h84v5+F02rSlqbYFOGkwES5NklXAK4C/aNYDvBT4WFNllNv2RODFwPkAVfVPVfVtxuTc0XlS+MAkK4DHAbsZk3M3Qsaqv+v2CH3fWJjd942TR+j7xsXsvu/riz3QqCRWhwN3dK3vZEz+Y0yyGngucD0wUVW7m013AhMDCmup3gP8LvD9Zv1JwLeraqZZH+XzdyTwDeBDzeX+v0hyEGNw7qpqF/AnwD/QSajuBW5kfM7dqBjb/q7brL5vXMzu+8bJvvq+kTdX31dVVy/2eKOSWI2lJI8HLgPeVFX3dW+rzjwYIzcXRpJXAnuq6sZBx9InK4DnAedV1XOB+5k17DfC5+5gOldGjgR+FDgIOG6gQWksPVLfN6rs+0bXXH1fkl9Z7PFGJbEau9dGJHkUnY7loqr6eFN8V5LDmu2HAXsGFd8SvBB4VZIddIYwXkpnXH5lc4kVRvv87QR2VtXev7I/RqezGYdz93PA31fVN6rqn4GP0zmf43LuRsXY9Xfd9tH3jYMf6PuS/LfBhtSqffV942Cuvu/fLPZgo5JYjdVrI5p7js4Hbq2qd3dtugJY1yyvA7Yud2xLVVVvqapVVbWaznn6VFWdBnwaeE1TbSTbBlBVdwJ3JHlGU3QscAtjcO7oXAY/Jsnjmt/RvW0bi3M3Qsaqv+v2CH3fyNtH37foqx7D5hH6vnEwV9+36BvzR2bm9SQn0Bm/3vvaiHcOOKRFS/Ii4G+AbTw0Fv9WOvcaXAo8BbgdOLmq7h5IkC1IMgn8x6p6ZZKn0vkr7hDgC8CvVNX3BhnfYiU5ms7NqY8GbgNeR+ePlJE/d0n+APglOk9vfQH4VTr394zFuRsV49TfddtX31dVVw0uqvZ1932DjqVNc/V9VXXPYKNqx1x932L7uZFJrCRJkobdqAwFSpIkDT0TK0mSpJaYWEmSJLXExEqSJKklJlaSJEktMbGSJElqiYmVJElSS/4/M2B/vE9XaY0AAAAASUVORK5CYII=",
      "text/plain": [
       "<Figure size 720x288 with 2 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     }
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "source": [
    "# Min-max normalization: rescale Age and SibSp to the [0, 1] range\n",
    "scaler = preprocessing.MinMaxScaler()\n",
    "xtrain = scaler.fit_transform(train[['Age','SibSp']])\n",
    "# fit_transform returns a bare ndarray, so restore the column names;\n",
    "# otherwise the histograms are titled 0 and 1 instead of by feature\n",
    "pd.DataFrame(xtrain, columns=['Age','SibSp']).hist(figsize=(10,4))"
   ],
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f9113f23b70>,\n",
       "        <matplotlib.axes._subplots.AxesSubplot object at 0x7f91142feac8>]],\n",
       "      dtype=object)"
      ]
     },
     "metadata": {},
     "execution_count": 10
    },
    {
     "output_type": "display_data",
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAlYAAAEICAYAAACdyboFAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAcWklEQVR4nO3df7DddX3n8eerpCoNalT0DgO0wWl0V2VFvGPp2HWvojZix9hd60JRQFmjFjvtNDNrtJ3V1XYGd4sUqUsbFybBoQirpckourLIXdZOQUGRoPgj0DgkjaQFjAYtNfreP843cnK9Iefe+/3ec869z8fMmfP9fr6f7/e87snNZ973+zNVhSRJkhbu54YdQJIkaamwsJIkSWqJhZUkSVJLLKwkSZJaYmElSZLUEgsrSZKkllhYSZIktcTCSp1L8tQk1yV5OMm3k/z2sDNJUtuSvCPJbUkeSbJ52Hk0HCuGHUDLwoeBfwEmgFOATyX5SlV9dbixJKlV/wD8MfDrwNFDzqIhiXdeV5eSrAQeAp5XVd9s2j4K7K6qjUMNJ0kdSPLHwAlVdd6ws2jxeShQXXsWcOBgUdX4CvDcIeWRJKkzFlbq2jHA92a07QOeOIQskiR1ysJKXdsPPGlG25OA7w8hiyRJnbKwUte+CaxIsqav7fmAJ65LkpYcCyt1qqoeBv4aeF+SlUleDKwDPjrcZJLUriQrkjwBOAo4KskTknj1/TJjYaXF8Dv0Lj3eC1wNvN1bLUhagv4I+CGwEXhDM/1HQ02kReftFiRJklriHitJkqSWWFhJkiS1xMJKkiSpJRZWkiRJLRmJy0CPPfbYWr169UB9H374YVauXNltoBaNW14Yv8zm7dZc8t5+++3/VFVP7zjSWHO8Gx3m7dZSzvuYY11VDf31whe+sAZ10003Ddx3FIxb3qrxy2zebs0lL3BbjcCYMsovx7vRYd5uLeW8jzXWeShQkiSpJRZWkiRJLbGwkiRJaomFlSRJUkssrCRJklpiYSVJktQSCytJkqSWWFhJkiS15IiFVZIrkuxNcldf2zVJ7mheO5Pc0bSvTvLDvmV/0WV4SWpbklVJPp7k60nuTvKrSZ6a5IYk32ren9L0TZIPJdmR5M4kpw47v6ThGuSRNpuBPweuPNhQVf/x4HSSi4B9ff3vqapT2go47rbv3sd5Gz/VybZ3XvjqTrYrLXOXAJ+pqtcleRzwC8C7gRur6sIkG4GNwDuBVwFrmtevAJc1763oavxw7JC6c8Q9VlV1M/DgbMuSBHg9cHXLuSRp0SV5MvAS4HKAqvqXqvousA7Y0nTbAry2mV4HXNk85eIWYFWS4xY5tqQRkt4jb47QKVkNfLKqnjej/SXAB6tqsq/fV4FvAt8D/qiq/t9htrkeWA8wMTHxwo997GMDBd6/fz/HHHPMQH1Hwd4H93H/D7vZ9snHP7mT7Y7bd2zebs0l70tf+tLbD44H4yjJKcAm4GvA84Hbgd8DdlfVqqZPgIeqalWSTwIXVtXnm2U3Au+sqttmbHde411X44djR495u7WU8z7WWDfIocDHchaH7q3aA/xiVT2Q5IXA3yR5blV9b+aKVbWJ3gDG5ORkTU1NDfSB09PTDNp3FFx61VYu2r7Qr3l2O8+e6mS74/Ydm7db45Z3gVYApwK/W1W3JrmE3mG/n6qqSnLkv0gPXWde411X44djR495u7Vc8877qsAkK4B/D1xzsK2qHqmqB5rp24F7gGctNKQkLZJdwK6qurWZ/zi9Quv+g4f4mve9zfLdwIl965/QtElaphZyu4WXA1+vql0HG5I8PclRzfQz6Z3Qee/CIkrS4qiq7wD3JXl203Q6vcOC24Bzm7Zzga3N9DbgnObqwNOAfVW1ZzEzSxotR9zHnORqYAo4Nsku4D1VdTlwJj970vpLgPcl+RHwE+BtVTXrie+SNKJ+F7iquSLwXuBN9P4I
vTbJ+cC36V20A3A9cAawA/hB01fSMnbEwqqqzjpM+3mztH0C+MTCY0nScFTVHcBsJ6WePkvfAi7oPJSkseGd1yVJklrSzeVqWhSrO7rx6Oa1KzvZriRJS517rCRJklpiYSVJktQSCytJkqSWWFhJkiS1xMJKkiSpJRZWkiRJLbGwkiRJaomFlSRJUkssrCRJklpiYSVJktQSCytJkqSWWFhJkiS1xMJKkiSpJRZWkiRJLbGwkiRJaomFlSRJUkssrCRJklpiYSVJktSSIxZWSa5IsjfJXX1t702yO8kdzeuMvmXvSrIjyTeS/HpXwSVJkkbNIHusNgNrZ2m/uKpOaV7XAyR5DnAm8Nxmnf+R5Ki2wkqSJI2yIxZWVXUz8OCA21sHfKyqHqmqvwd2AC9aQD5JkqSxsWIB674jyTnAbcCGqnoIOB64pa/PrqbtZyRZD6wHmJiYYHp6eqAP3b9//8B9R8HE0bDh5APDjjEn4/Ydm7db45ZXkoZpvoXVZcD7gWreLwLePJcNVNUmYBPA5ORkTU1NDbTe9PQ0g/YdBZdetZWLti+kfl18m9euHKvveNx+J8wrSUvXvK4KrKr7q+rHVfUT4CM8erhvN3BiX9cTmjZJGgtJdibZ3lyYc1vT9tQkNyT5VvP+lKY9ST7UXLBzZ5JTh5te0rDNq7BKclzf7G8CB68Y3AacmeTxSU4C1gBfWFhESVp0L20uzJls5jcCN1bVGuDGZh7gVfTGuTX0Tm24bNGTShopRzxGleRqYAo4Nsku4D3AVJJT6B0K3Am8FaCqvprkWuBrwAHggqr6cTfRJWnRrKM3DgJsAaaBdzbtV1ZVAbckWZXkuKraM5SUkobuiIVVVZ01S/Plj9H/T4A/WUgoSRqiAj6bpIC/bM4Hnegrlr4DTDTTxwP39a178IKdQwqr+V6s09XFL11djDBuFzqYt1vLNe94nVUtSd37taraneQZwA1Jvt6/sKqqKboGNt+Ldbq6+GXn2YN9/lyN24UO5u3Wcs3rI20kqU9V7W7e9wLX0bs45/6D55Y273ub7l6wI+kQFlaS1EiyMskTD04Dr6R3cc424Nym27nA1mZ6G3BOc3XgacA+z6+SljcPBUrSoyaA65JAb3z8q6r6TJIvAtcmOR/4NvD6pv/1wBn0njLxA+BNix9Z0iixsJKkRlXdCzx/lvYHgNNnaS/ggkWIJmlMeChQkiSpJRZWkiRJLbGwkiRJaomFlSRJUkssrCRJklpiYSVJktQSCytJkqSWWFhJkiS1xMJKkiSpJRZWkiRJLbGwkiRJaomFlSRJUkssrCRJklpiYSVJktQSCytJkqSWHLGwSnJFkr1J7upr++9Jvp7kziTXJVnVtK9O8sMkdzSvv+gyvCRJ0igZZI/VZmDtjLYbgOdV1b8Bvgm8q2/ZPVV1SvN6WzsxJUmSRt8RC6uquhl4cEbbZ6vqQDN7C3BCB9kkSZLGShvnWL0Z+HTf/ElJvpzk/yb5ty1sX5IkaSysWMjKSf4QOABc1TTtAX6xqh5I8kLgb5I8t6q+N8u664H1ABMTE0xPTw/0mfv37x+47yiYOBo2nHzgyB1HyLh9x+bt1rjllaRhmndhleQ84DeA06uqAKrqEeCRZvr2JPcAzwJum7l+VW0CNgFMTk7W1NTUQJ87PT3NoH1HwaVXbeWi7QuqXxfd5rUrx+o7HrffCfNK0tI1r0OBSdYC/xl4TVX9oK/96UmOaqafCawB7m0jqCRJ0qg74q6UJFcDU8CxSXYB76F3FeDjgRuSANzSXAH4EuB9SX4E/AR4W1U9OOuGJUmSlpgjFlZVddYszZcfpu8ngE8sNJQkSdI48s7rkiRJLbGwkqQ+SY5qbhnzyWb+pCS3JtmR5Jokj2vaH9/M72iWrx5mbkmjwcJKkg71e8DdffMfAC6uql8GHgLOb9rPBx5q2i9u+kla5sbrPgBaFNt37+O8jZ9qfbs7L3x169uU2pTkBODVwJ8Af5De1Tkv
A3676bIFeC9wGbCumQb4OPDnSXLw9jOSlif3WEnSo/6M3q1kftLMPw34bt8jvHYBxzfTxwP3ATTL9zX9JS1j7rGSJCDJbwB7m5sbT7W87Xk9aaKrJzd0dSf9cbtLv3m7tVzzWlhJUs+LgdckOQN4AvAk4BJgVZIVzV6pE4DdTf/dwInAriQrgCcDD8y24fk+aaKrJzfsPHuwz5+rcbtLv3m7tVzzeihQkoCqeldVnVBVq4Ezgc9V1dnATcDrmm7nAlub6W3NPM3yz3l+lSQLK0l6bO+kdyL7DnrnUB28QfLlwNOa9j8ANg4pn6QR4qFASZqhqqaB6Wb6XuBFs/T5Z+C3FjWYpJHnHitJkqSWWFhJkiS1xMJKkiSpJRZWkiRJLbGwkiRJaomFlSRJUkssrCRJklpiYSVJktQSCytJkqSWWFhJkiS1xMJKkiSpJQMVVkmuSLI3yV19bU9NckOSbzXvT2nak+RDSXYkuTPJqV2FlyRJGiWD7rHaDKyd0bYRuLGq1gA38uiT3V8FrGle64HLFh5TkiRp9A1UWFXVzcCDM5rXAVua6S3Aa/var6yeW4BVSY5rI6wkSdIoW7GAdSeqak8z/R1gopk+Hrivr9+upm1PXxtJ1tPbo8XExATT09MDfej+/fsH7jsKJo6GDScfGHaMOekqc1f/buP2O2FeSVq6FlJY/VRVVZKa4zqbgE0Ak5OTNTU1NdB609PTDNp3FFx61VYu2t7K17xoNpx8oJPMO8+ean2bMH6/E+aVpKVrIVcF3n/wEF/zvrdp3w2c2NfvhKZNkiRpSVtIYbUNOLeZPhfY2td+TnN14GnAvr5DhpIkSUvWQMd7klwNTAHHJtkFvAe4ELg2yfnAt4HXN92vB84AdgA/AN7UcmZJkqSRNFBhVVVnHWbR6bP0LeCChYSSJEkaR955XZIkqSUWVpIkSS2xsJIkSWqJhZUkSVJLLKwkSZJaYmElSY0kT0jyhSRfSfLVJP+1aT8pya1JdiS5JsnjmvbHN/M7muWrh5lf0vBZWEnSox4BXlZVzwdOAdY2Nzr+AHBxVf0y8BBwftP/fOChpv3ipp+kZczCSpIa1bO/mf355lXAy4CPN+1bgNc20+uaeZrlpyfJIsWVNIIsrCSpT5KjktxB7/mnNwD3AN+tqgNNl13A8c308cB9AM3yfcDTFjexpFEy0J3XJWm5qKofA6ckWQVcB/yrhW4zyXpgPcDExATT09MDrTdxNGw4+cCRO87RoJ8/V/v37+9s210wb7eWa14LK0maRVV9N8lNwK8Cq5KsaPZKnQDsbrrtBk4EdiVZATwZeGCWbW0CNgFMTk7W1NTUQBkuvWorF21vf5jeefZgnz9X09PTDPqzjQLzdmu55vVQoCQ1kjy92VNFkqOBVwB3AzcBr2u6nQtsbaa3NfM0yz/XPC9V0jLlHitJetRxwJYkR9H7w/Paqvpkkq8BH0vyx8CXgcub/pcDH02yA3gQOHMYoSWNDgsrSWpU1Z3AC2Zpvxd40Szt/wz81iJEkzQmPBQoSZLUEgsrSZKkllhYSZIktcTCSpIkqSUWVpIkSS2xsJIkSWrJvG+3kOTZwDV9Tc8E/guwCngL8I9N+7ur6vp5J5QkSRoT8y6squobwCnQe2gpvUc7XAe8Cbi4qv60lYSSJEljoq1DgacD91TVt1vaniRJ0thp687rZwJX982/I8k5wG3Ahqp6aOYK833a+7g9Lburp9N3qavMXf27jdvvhHklaelacGGV5HHAa4B3NU2XAe8Hqnm/CHjzzPXm+7T3cXtadldPp+/ShpMPdJJ559lTrW8Txu93wryStHS1cSjwVcCXqup+gKq6v6p+XFU/AT7CLM/XkiRJWoraKKzOou8wYJLj+pb9JnBXC58hSZI08hZ0vCfJSuAVwFv7mv9bklPoHQrcOWOZJEnSkrWgwqqqHgaeNqPtjQtKJEmSNKbG66xqjbXVGz/VyXY3r13ZyXYlSZorH2kjSZLU
EgsrSZKkllhYSZIktcTCSpIkqSUWVpIkSS2xsJIkSWqJhZUkSVJLLKwkSZJaYmElSZLUEgsrSZKklvhIG0lqJDkRuBKYoPcg+U1VdUmSpwLXAKvpPVz+9VX1UJIAlwBnAD8AzquqLw0j+1z4eCmpO+6xkqRHHQA2VNVzgNOAC5I8B9gI3FhVa4Abm3mAVwFrmtd64LLFjyxplFhYSVKjqvYc3ONUVd8H7gaOB9YBW5puW4DXNtPrgCur5xZgVZLjFjm2pBHioUBJmkWS1cALgFuBiara0yz6Dr1DhdAruu7rW21X07anr40k6+nt0WJiYoLp6emBMkwcDRtOPjCv/MOwf//+gX+2UWDebi3XvBZWkjRDkmOATwC/X1Xf651K1VNVlaTmsr2q2gRsApicnKypqamB1rv0qq1ctH18hunNa1cy6M82Cqanp83boeWa10OBktQnyc/TK6quqqq/bprvP3iIr3nf27TvBk7sW/2Epk3SMmVhJUmN5iq/y4G7q+qDfYu2Aec20+cCW/vaz0nPacC+vkOGkpah8dnHLEndezHwRmB7kjuatncDFwLXJjkf+Dbw+mbZ9fRutbCD3u0W3rS4cSWNmrErrLbv3sd5HdyDZeeFr259m5LGS1V9HshhFp8+S/8CLug0lKSxsuDCKslO4PvAj4EDVTV5uJvpLfSzJEmSRllb51i9tKpOqarJZv5wN9OTJElasro6ef1wN9OTJElasto4x6qAzzb3dfnL5n4th7uZ3k+N2g3zurqJ2bjd4A/GL/NyvQndYhm3vJI0TG0UVr9WVbuTPAO4IcnX+xce7mZ6o3bDvJ1nD/b5czVuN/iDXlE1Tpm9KWG3xi2vJA3Tgg8FVtXu5n0vcB3wIg5/Mz1JkqQla0GFVZKVSZ54cBp4JXAXh7+ZniRJ0pK10OM9E8B1zXO0VgB/VVWfSfJFZr+ZniRJ0pK1oMKqqu4Fnj9L+wPMcjO9Uba6g5uOAmw4uZPNSpKkETQ+ZyhLh+Hd+CVJo8KHMEuSJLXEwkqSJKklFlaSJEktsbCSJElqiYWVJElSSyysJEmSWmJhJUmS1BILK0mSpJZYWEmSJLXEwkqSJKklFlaSJEktsbCSJElqiYWVJElSSyysJEmSWmJhJUmS1BILK0lqJLkiyd4kd/W1PTXJDUm+1bw/pWlPkg8l2ZHkziSnDi+5pFFhYSVJj9oMrJ3RthG4sarWADc28wCvAtY0r/XAZYuUUdIIs7CSpEZV3Qw8OKN5HbClmd4CvLav/crquQVYleS4xUkqaVStGHYASRpxE1W1p5n+DjDRTB8P3NfXb1fTtocZkqynt1eLiYkJpqenB/vgo2HDyQfml3oI9u/fP/DPNgrM263lmnfehVWSE4Er6Q0yBWyqqkuSvBd4C/CPTdd3V9X1Cw0qScNWVZWk5rHeJmATwOTkZE1NTQ203qVXbeWi7ePz9+/mtSsZ9GcbBdPT0+bt0HLNu5D/sQeADVX1pSRPBG5PckOz7OKq+tMFp5Ok4bs/yXFVtac51Le3ad8NnNjX74SmTdIyNu9zrKpqT1V9qZn+PnA3vd3gkrSUbAPObabPBbb2tZ/TXB14GrCv75ChpGWqlZPXk6wGXgDc2jS9o7n8+IqDlyZL0qhLcjXwd8Czk+xKcj5wIfCKJN8CXt7MA1wP3AvsAD4C/M4QIksaMQs+eJ/kGOATwO9X1feSXAa8n955V+8HLgLePMt6y+JkznHLC+OXuau8XZ10uVxP6BwHVXXWYRadPkvfAi7oNpGkcbOgwirJz9Mrqq6qqr8GqKr7+5Z/BPjkbOsul5M5N5x8YKzywvhl7izv9ofb3yawee0xy/KETklaDuZ9KDBJgMuBu6vqg33t/fdx+U3grpnrSpIkLUUL+TP/xcAbge1J7mja3g2cleQUeocCdwJvXVBCSZKkMTHvwqqqPg9klkXes0qSJC1LPtJGkiSpJRZWkiRJLbGwkiRJaomFlSRJUkssrCRJ
klpiYSVJktQSCytJkqSWWFhJkiS1xMJKkiSpJRZWkiRJLbGwkiRJaomFlSRJUkssrCRJklqyYtgBpOVm++59nLfxU61vd+eFr259m5KkubGwkiS1wj8aJA8FSpIktcbCSpIkqSUWVpIkSS3xHCtJ0khb3cF5WwCb167sZLvjllftco+VJElSSyysJEmSWtLZocAka4FLgKOA/1lVF3b1WZI8/DAsjnXS4hr1sa6TwirJUcCHgVcAu4AvJtlWVV/r4vMkaRgc67SYvE/YeOjqUOCLgB1VdW9V/QvwMWBdR58lScPiWCfpEKmq9jeavA5YW1X/qZl/I/ArVfWOvj7rgfXN7LOBbwy4+WOBf2oxbtfGLS+MX2bzdmsueX+pqp7eZZhRMshY17Q73o0m83ZrKec97Fg3tNstVNUmYNNc10tyW1VNdhCpE+OWF8Yvs3m7NW55R5Hj3Wgyb7eWa96uDgXuBk7smz+haZOkpcSxTtIhuiqsvgisSXJSkscBZwLbOvosSRoWxzpJh+jkUGBVHUjyDuB/07sE+Yqq+mpLm5/z7vQhG7e8MH6Zzdutccu7aDoe62D8vnvzdsu83Wolbycnr0uSJC1H3nldkiSpJRZWkiRJLRnZwirJ2iTfSLIjycZZlj8+yTXN8luTrF78lIfkOVLeP0jytSR3JrkxyS8NI2dfnsfM29fvPySpJEO9ZHaQvEle33zHX03yV4udcZY8R/qd+MUkNyX5cvN7ccYwcjZZrkiyN8ldh1meJB9qfpY7k5y62BmXqnEb65pMjncdGrfxzrFuhqoauRe9k0DvAZ4JPA74CvCcGX1+B/iLZvpM4JoRz/tS4Bea6bePet6m3xOBm4FbgMlRzgusAb4MPKWZf8aw8s4h8ybg7c30c4CdQ8z7EuBU4K7DLD8D+DQQ4DTg1mF+v0vlNW5j3RwyO951+/2OzHjnWPezr1HdYzXIYyLWAVua6Y8DpyfJImbsd8S8VXVTVf2gmb2F3v1uhmXQx3C8H/gA8M+LGW4Wg+R9C/DhqnoIoKr2LnLGmQbJXMCTmuknA/+wiPkODVJ1M/DgY3RZB1xZPbcAq5IctzjplrRxG+vA8a5r4zbeOdbNMKqF1fHAfX3zu5q2WftU1QFgH/C0RUn3swbJ2+98ehXxsBwxb7P788Sq6uYx4nMzyPf7LOBZSf42yS1J1i5autkNkvm9wBuS7AKuB353caLNy1x/xzWYcRvrDsnTcLxr17iNd451MwztkTbLVZI3AJPAvxt2lsNJ8nPAB4HzhhxlLlbQ2z0+Re+v45uTnFxV3x1qqsd2FrC5qi5K8qvAR5M8r6p+MuxgUhsc7zozbuPdshrrRnWP1SCPifhpnyQr6O1efGBR0v2sgR5rkeTlwB8Cr6mqRxYp22yOlPeJwPOA6SQ76R1n3jbEEzoH+X53Aduq6kdV9ffAN+kNPMMySObzgWsBqurvgCfQewjoKPLRLd0Yt7HukDwNx7t2jdt451g307BOIDvCyWUrgHuBk3j0ZLjnzuhzAYee0HntiOd9Ab0T/NaMw/c7o/80wz2Zc5Dvdy2wpZk+lt6u3KeNeOZPA+c10/+a3nkHGWLm1Rz+hM5Xc+gJnV8YVs6l9Bq3sW4OmR3vuv1+R2a8c6ybZRvD+sEG+MHPoFeF3wP8YdP2Pnp//UCv4v1fwA7gC8AzRzzv/wHuB+5oXttGOe+MvkMdaAb8fkNvd/7XgO3AmcPMO2Dm5wB/2wxEdwCvHGLWq4E9wI/o/TV8PvA24G193++Hm59l+7B/H5bSa9zGugEzO951+/2O1HjnWHfoy0faSJIktWRUz7GSJEkaOxZWkiRJLbGwkiRJaomFlSRJUkssrCRJklpiYSVJktQSCytJkqSW/H/aYKEMVV48XgAAAABJRU5ErkJggg==",
      "text/plain": [
       "<Figure size 720x288 with 2 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     }
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "markdown",
   "source": [
    "### 其他预处理方法"
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "source": [
    "# Min-Max Scaling (linear rescaling of every feature to [0, 1])\n",
    "\n",
    "# Toy data: each row is a sample, each column is a feature\n",
    "x = np.array([[ 0., -3.,  1.], [ 3.,  1.,  2.], [ 0.,  1., -1.]])\n",
    "min_max_scaler = preprocessing.MinMaxScaler()  # rescales each column by its own min and max\n",
    "minmax_x = min_max_scaler.fit_transform(x)\n",
    "print(minmax_x)"
   ],
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "[[0.         0.         0.66666667]\n",
      " [1.         1.         1.        ]\n",
      " [0.         1.         0.        ]]\n"
     ]
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "source": [
    "# Z-score standardization (zero mean, unit variance per feature)\n",
    "x = np.array([\n",
    "    [0., -3.,  1.],\n",
    "    [3.,  1.,  2.],\n",
    "    [0.,  1., -1.],\n",
    "])  # toy data: rows are samples, columns are features\n",
    "standard_scaled_scaler = preprocessing.StandardScaler()\n",
    "standard_scaled_x = standard_scaled_scaler.fit_transform(x)\n",
    "print(standard_scaled_x)  # each printed column (feature) sums to 0, i.e. has zero mean"
   ],
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "[[-0.70710678 -1.41421356  0.26726124]\n",
      " [ 1.41421356  0.70710678  1.06904497]\n",
      " [-0.70710678  0.70710678 -1.33630621]]\n"
     ]
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "source": [
    "# Outliers -- Winsorization\n",
    "# Plot one 30-bin histogram per column of `x` (the 3x3 toy array from the scaling cells above).\n",
    "# NOTE(review): nothing is winsorized in this cell; presumably a later cell does it -- confirm.\n",
    "pd.DataFrame(x).hist(bins=30)"
   ],
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f9114719278>,\n",
       "        <matplotlib.axes._subplots.AxesSubplot object at 0x7f91147aa390>],\n",
       "       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f91147d6898>,\n",
       "        <matplotlib.axes._subplots.AxesSubplot object at 0x7f9114809e48>]],\n",
       "      dtype=object)"
      ]
     },
     "metadata": {},
     "execution_count": 14
    },
    {
     "output_type": "display_data",
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAEICAYAAABPgw/pAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAQWUlEQVR4nO3df6jd9X3H8deriVa50ZQtcisxeIVZmVPWzot2CN2NtVu0xewn6Dq3bI4rW2V2CFuKslHpoDJ0MCcrYYa0kpnKVJI1Fmepd1JQ541EY5IqmURMJs1suuhNRXe39/64X/Xm9txzTrzfc76f73vPB1w43/P9+snrfvPx5dfvj3McEQIAtN+Hmg4AAKgHhQ4ASVDoAJAEhQ4ASVDoAJAEhQ4ASVDoAJAEhV4g2z9l+2Hbx22/Yvu3m84EDJLtm2xP237b9pam87TV8qYDoKN7JL0jaVTSxyXttP1cROxtNhYwMP8h6SuSfkXS6Q1naS3zpGhZbI9I+pGkiyLipeq9+yQdjoiNjYYDBsz2VySdExEbms7SRpxyKc/HJM2+W+aV5yT9XEN5ALQEhV6eFZLeWPDeMUlnNJAFQItQ6OWZkXTmgvfOlPRmA1kAtAiFXp6XJC23ff68935eEhdEAXRFoRcmIo5LekjS7bZHbF8uab2k+5pNBgyO7eW2T5O0TNIy26fZ5i68k0Shl+mPNXfr1hFJ90v6I25ZRHK3SXpL0kZJv1O9vq3RRC3EbYsAkARH6ACQBIUOAElQ6ACQBIUOAEk0dlvQqlWrYmxsrOO648ePa2RkZLiBloC8g9Ut765du16PiLOGHOkD6TbnS9C2eTEope+HbnO+sUIfGxvT9PR0x3VTU1OamJgYbqAlIO9gdctr+5Xhpvngus35ErRtXgxK6fuh25yv5ZSL7TW2H7e9z/Ze2zfXMS5QMuY9SlPXEfqspFsi4lnbZ0jaZfuxiNhX0/hAiZj3KEotR+gR8VpEPFu9flPSfkmr6xgbKBXzHqWp/UlR22OSntDcFzS8sWDdpKRJSRodHb1k27ZtHcc4cvSYfvDW+8sXr15Za8a6zczMaMWKFU3H6FumvGvXrt0VEeNDjvQTFpv3/c75ErRtXgxKE/thz+FjJyx367xuc77WQre9QtK/SvqriHio27bj4+Ox2AWiu7du15173j8bdPCrn60t4yCUfhFloUx5bTde6P3O+25zvgRtmxeD0sR+GNu484Tlbp3Xbc7Xdh+67VMkPShpa68yB7Jg3qMkdd3lYkn3StofEXfVMSZQOuY9SlPXEfrlkq6XdIXt3dXP1TWNDZSKeY+i1HLbYkR8T5LrGAtoC+Y9SsNnuQBAEhQ6ACRBoQNAEhQ6ACRBoQNAEhQ6ACRBoQNAEhQ6ACRBoQNAEhQ6ACRBoQNAEhQ6ACRBoQNAEhQ6ACRBoQNAEhQ6ACRBoQNAEhQ6ACRBoQNAEhQ6ACRBoQNAEhQ6ACRBoQNAEhQ6ACRBoQNAEhQ6ACRBoQNAEhQ6ACRBoQNAEhQ6ACRBoQNAEhQ6ACRBoQNAEhQ6ACRBoQNAEhQ6ACRRW6Hb3mz7iO0X6hoTKBlzHqWp8wh9i6R1NY4HlG6LmPMoSG2FHhFPSDpa13hA6ZjzKI0jor7B7DFJ34qIixZZPylpUpJGR0cv2bZtW8dxjhw9ph+89f7yxatX1pZxEGZmZrRixYqmY/St9Lx7Dh87Yfm8lcsWzbt27dpdETE+jFyd1DXnF/7OTcz50ufFsDSxH07m77/bnB9qoc83Pj4e09PTHdfdvXW77tyz/L3lg1/9bE0JB2NqakoTExNNx+hb6XnHNu48YXnLupFF89ouutDn6zbnF/7OTcz50ufFsDSxH07m77/bnOcuFwBIgkIHgCTqvG3xfklPSrrA9iHbN9Q1NlAi5jxKs7z3Jv2JiOvqGgtoA+Y8SsMpFwBIgkIHgCQodABIgkIHgCQo
dABIgkIHgCQodABIgkIHgCQodABIgkIHgCQodABIgkIHgCQodABIgkIHgCQodABIgkIHgCQodABIgkIHgCQodABIgkIHgCQodABIgkIHgCQodABIgkIHgCQodABIgkIHgCQodABIgkIHgCQodABIgkIHgCQodABIgkIHgCQodABIgkIHgCQodABIgkIHgCRqK3Tb62y/aPuA7Y11jQuUjHmPktRS6LaXSbpH0lWSLpR0ne0L6xgbKBXzHqWp6wj9UkkHIuLliHhH0jZJ62saGygV8x5FWV7TOKslvTpv+ZCkyxZuZHtS0mS1OGP7xUXGWyXp9ff+uTtqSjk4J+RtgVblXXtH17znDjPLAj3n/UnM+RM0NOdbNS8GqPH90OPvf9E5X1eh9yUiNkna1Gs729MRMT6ESLUg72C1Le98/c75ErR5P9epzfuhrlMuhyWtmbd8TvUekBnzHkWpq9CfkXS+7fNsnyrpWkk7ahobKBXzHkWp5ZRLRMzavknSo5KWSdocEXuXMGQr/hd1HvIOVpF5BzDvm1bkfm5Aa/eDI6LpDACAGvCkKAAkQaEDQBJFFXrbHqO2vdn2EdsvNJ2lF9trbD9ue5/tvbZvbjpTL7ZPs/1vtp+rMn+56UzZ2f5r29+3/bzth21/pOlMw9S2DlqomHPo1WPUL0n6jOYe0HhG0nURsa/RYF3Y/pSkGUnfiIiLms7Tje2zJZ0dEc/aPkPSLkm/Wvj+taSRiJixfYqk70m6OSKeajhaWrZ/WdJ3qwu+d0hSRPx5w7GGoo0dtFBJR+ite4w6Ip6QdLTpHP2IiNci4tnq9ZuS9mvuScdixZyZavGU6qeMI5CkIuJfImK2WnxKc/fW/3/Rug5aqKRC7/QYddGF01a2xyR9QtLTzSbpzfYy27slHZH0WEQUnzmRP5D07aZDDFHrO2ioj/6jebZXSHpQ0hcj4o2m8/QSEf8j6ePVudyHbV8UEcVfsyiZ7e9I+miHVbdGxPZqm1slzUraOsxsWJqSCp3HqAesOg/9oKStEfFQ03lORkT8l+3HJa2TRKEvQURc2W297Q2SPifp01HKRbbhaH0HlXTKhceoJdn+sO17bb9i+03bu21fVcO4lnSvpP0RcdfSkw6e7bPevcvC9umau1j1/WZT5WZ7naQ/k3RNRPy46TxD1voOKqbQqwsx7z5GvV/SA6U/Rm37fklPSrrA9iHbN9Qw7HLNncf7JUkrJd0m6YHqvPdSXC7peklXVP+R2G376iWOOWhnS3rc9vOa+5ftsYj4VsOZsvs7SWdIeqyaI19rOtCwtLGDFirmtkUsriq0L0fEg01nAVCuYo7Q0ZntUUkfk9SqIwUAw8cResGqi5jflvTvEXFj03kAlI1CL5TtD0n6R0lnSlofEf/dcCQAhSvptkVU5t2RMirpasocQD8o9DL9vaSflXRlRLzVdBgA7cApl8LYPlfSQUlva+5JvXfdGBE8tQdgURQ6ACTBbYsAkASFDgBJUOgAkASFDgBJNHbb4qpVq2JsbKzjuuPHj2tkZGS4gZaAvIPVLe+uXbtej4izhhwJKFJjhT42Nqbp6emO66ampjQxMTHcQEtA3sHqltf2K8NNA5Sr5ymXXt9s7zl/W31L9vO2f6H+mACAXvo5h75Fc98Ss5irJJ1f/Uxq7ilHAMCQ9Sz0Pr7Zfr2kb1Tf0P6UpI/YPruugACA/tRxDn2xb8p+beGGtic1dxSv0dFRTU1NdRzwyNFjunvr9veWL169soaYgzMzM7Po71Ki0vfvnsPHTlg+b+WyVu1foClDvSgaEZskbZKk8fHxWOxC191bt+vOPe9HO/j5ztuVom0XGUvfvxs27jxhecu6kVbtX6ApddyH3vpvygaADOoo9B2Sfre62+WTko5FxE+cbgEADFbPUy7VN9tPSFpl+5Ckv5R0iiRFxNckPSLpakkHJP1Y0u8PKiwAYHE9Cz0iruuxPiR9obZEAIAPhM9yAYAkKHQASIJCB4Ak
KHQASIJCB4AkKHQASIJCB4AkKHQASIJCB4AkKHQASIJCB4AkKHQASIJCB4AkKHQASIJCB4AkKHQASIJCB4AkKHQASIJCB4AkKHQASIJCB4AkKHQASIJCB4AkKHQASIJCB4AkKHQASIJCB4AkKHQASIJCB4AkKHQASIJCB4AkKHQASIJCB4AkKHQASIJCB4AkKHQASKKvQre9zvaLtg/Y3thh/Qbb/2l7d/Xzh/VHBQB0s7zXBraXSbpH0mckHZL0jO0dEbFvwabfjIibBpARANCHfo7QL5V0ICJejoh3JG2TtH6wsQAAJ6vnEbqk1ZJenbd8SNJlHbb7DdufkvSSpD+NiFcXbmB7UtKkJI2OjmpqaqrjHzh6unTLxbPvLS+2XSlmZmaKzzhf6ft3fjapffsXaEo/hd6Pf5Z0f0S8bftGSV+XdMXCjSJik6RNkjQ+Ph4TExMdB7t763bduef9aAc/33m7UkxNTWmx36VEpe/fDRt3nrC8Zd1Iq/Yv0JR+TrkclrRm3vI51XvviYgfRsTb1eI/SLqknngAgH71U+jPSDrf9nm2T5V0raQd8zewffa8xWsk7a8vIgCgHz1PuUTErO2bJD0qaZmkzRGx1/btkqYjYoekP7F9jaRZSUclbRhgZgBAB32dQ4+IRyQ9suC9v5j3+kuSvlRvNADAyeBJUQBIgkIHgCQodABIgkIHgCQodABIgkIHgCQodABIgkIHgCQodABIgkIHgCQodABIgkIHgCQodABIgkIHgCQodABIgkIHgCQodABIgkIHgCQodABIgkIHgCQodABIgkIHgCQodABIgkIHgCQodABIgkIHgCQodABIgkIHgCQodABIgkIHgCQodABIgkIHgCQodABIgkIHgCQodABIgkIHgCT6KnTb62y/aPuA7Y0d1n/Y9jer9U/bHqs7KACgu56FbnuZpHskXSXpQknX2b5wwWY3SPpRRPyMpL+RdEfdQQEA3fVzhH6ppAMR8XJEvCNpm6T1C7ZZL+nr1et/kvRp264vJgCgl+V9bLNa0qvzlg9JumyxbSJi1vYxST8t6fX5G9melDRZLc7YfnGRP3PV/H/W5R/vn5C3BVq1f9fe0XX/njvMLEDJ+in02kTEJkmbem1nezoixocQqRbkHay25QWa0s8pl8OS1sxbPqd6r+M2tpdLWinph3UEBAD0p59Cf0bS+bbPs32qpGsl7ViwzQ5Jv1e9/k1J342IqC8mAKCXnqdcqnPiN0l6VNIySZsjYq/t2yVNR8QOSfdKus/2AUlHNVf6S9HztExhyDtYbcsLNMIcSANADjwpCgBJUOgAkESRhW77t2zvtf2/tou9Xa3XRyKUxvZm20dsv9B0ln7YXmP7cdv7qvlwc9OZgJIVWeiSXpD065KeaDrIYvr8SITSbJG0rukQJ2FW0i0RcaGkT0r6Qgv2MdCYIgs9IvZHxGJPkZain49EKEpEPKG5u5BaISJei4hnq9dvStqvuaeSAXRQZKG3RKePRKBsBqT6BM9PSHq62SRAuYb66P98tr8j6aMdVt0aEduHnQflsr1C0oOSvhgRbzSdByhVY4UeEVc29WfXpJ+PRMAS2T5Fc2W+NSIeajoPUDJOuXxw/XwkApag+gjmeyXtj4i7ms4DlK7IQrf9a7YPSfpFSTttP9p0poUiYlbSux+JsF/SAxGxt9lU3dm+X9KTki6wfcj2DU1n6uFySddLusL27urn6qZDAaXi0X8ASKLII3QAwMmj0AEgCQodAJKg0AEgCQodAJKg0AEgCQodAJL4PzSwIOzO08QYAAAAAElFTkSuQmCC",
      "text/plain": [
       "<Figure size 432x288 with 4 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     }
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "source": [
    "# Rank transform: replace every value by its position in sorted order.\n",
    "# Ties share the mean of the ranks they occupy ('average' is the pandas default),\n",
    "# which is why both 7s come out as 6.5.\n",
    "obj = pd.Series([7, -5, 7, 4, 2, 0, 4])\n",
    "obj.rank(method='average')"
   ],
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "0    6.5\n",
       "1    1.0\n",
       "2    6.5\n",
       "3    4.5\n",
       "4    3.0\n",
       "5    2.0\n",
       "6    4.5\n",
       "dtype: float64"
      ]
     },
     "metadata": {},
     "execution_count": 15
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "source": [
    "# log / sqrt transforms; the offsets (+1, +0.5) shift the input before the function is applied.\n",
    "# x is not defined in this cell, so it must already exist in the kernel.\n",
    "# Entries that are still outside the function's domain after the shift show up as nan\n",
    "# (see the sqrt output) or -inf.\n",
    "# Only the last expression of a cell is rendered, so the log result is printed explicitly\n",
    "# instead of being computed and silently discarded.\n",
    "print(np.log(1 + x))\n",
    "np.sqrt(x + 0.5)\n"
   ],
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "array([[0.70710678,        nan, 1.22474487],\n",
       "       [1.87082869, 1.22474487, 1.58113883],\n",
       "       [0.70710678, 1.22474487,        nan]])"
      ]
     },
     "metadata": {},
     "execution_count": 17
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "source": [
    "## Categorical\n",
    "### Label Encoder\n",
    "#### Alphabetical\n",
    "\n",
    "sex = pd.Series([\"male\", \"female\", \"female\", \"male\"])\n",
    "# LabelEncoder numbers the classes in sorted order, so female -> 0 and male -> 1\n",
    "# (the printed result below is [1 0 0 1]).\n",
    "# fit() returns the encoder itself, so construction and fitting are chained.\n",
    "le = preprocessing.LabelEncoder().fit([\"male\", \"female\"])\n",
    "sex = le.transform(sex)  # encode the original values with the fitted encoder\n",
    "print(sex)"
   ],
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "[1 0 0 1]\n"
     ]
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "source": [
    "#### Order of appearance\n",
    "\n",
    "# factorize() hands out integer codes in order of first occurrence and returns\n",
    "# a (codes, uniques) pair.\n",
    "licenses = pd.Series(['a', 'b', 'c', 'd', 'b', 'c', 'd'])\n",
    "factorized_licenses = licenses.factorize()\n",
    "print(factorized_licenses)"
   ],
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "(array([0, 1, 2, 3, 1, 2, 3]), Index(['a', 'b', 'c', 'd'], dtype='object'))\n"
     ]
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "source": [
    "#### One-hot\n",
    "\n",
    "data = ['cold', 'cold', 'warm', 'cold', 'hot', 'hot', 'warm', 'cold', 'warm', 'hot']\n",
    "values = np.array(data)\n",
    "label_encoder = preprocessing.LabelEncoder()  # integer encode\n",
    "integer_encoded = label_encoder.fit_transform(values)\n",
    "# OneHotEncoder expects 2-D (n_samples, n_features) input: turn the codes into one column.\n",
    "integer_encoded = integer_encoded.reshape(-1, 1)\n",
    "# The keyword that requests dense output was renamed across scikit-learn releases\n",
    "# (sparse -> sparse_output). Keep the default sparse result (only non-zero entries are\n",
    "# stored) and densify it explicitly, which works regardless of the installed version.\n",
    "onehot_encoder = preprocessing.OneHotEncoder()\n",
    "onehot_encoded = onehot_encoder.fit_transform(integer_encoded).toarray()\n",
    "print(onehot_encoded)"
   ],
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "[[1. 0. 0.]\n",
      " [1. 0. 0.]\n",
      " [0. 0. 1.]\n",
      " [1. 0. 0.]\n",
      " [0. 1. 0.]\n",
      " [0. 1. 0.]\n",
      " [0. 0. 1.]\n",
      " [1. 0. 0.]\n",
      " [0. 0. 1.]\n",
      " [0. 1. 0.]]\n"
     ]
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "source": [
    "## Text\n",
    "### Bag of Words\n",
    "\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "\n",
    "# Raw string: '\\w' is not a valid escape sequence in a normal Python string literal.\n",
    "# \\w{1,} also keeps one-character tokens such as 'i'.\n",
    "cv = CountVectorizer(ngram_range=(1, 1), binary=False, token_pattern=r'\\w{1,}')\n",
    "vec = cv.fit_transform(['I love you', 'you are my angle are'])\n",
    "# Feature names in column order, read from the fitted vocabulary (term -> column index).\n",
    "# get_feature_names() is gone from recent scikit-learn; vocabulary_ is available in old\n",
    "# and new releases alike and yields the same list.\n",
    "feature_names = sorted(cv.vocabulary_, key=cv.vocabulary_.get)\n",
    "print(feature_names)\n",
    "print(vec)  # sparse matrix, printed as (row, column) count\n",
    "df = pd.DataFrame(vec.toarray(), columns=feature_names)  # to DataFrame\n",
    "print(df.head())"
   ],
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "['angle', 'are', 'i', 'love', 'my', 'you']\n",
      "  (0, 2)\t1\n",
      "  (0, 3)\t1\n",
      "  (0, 5)\t1\n",
      "  (1, 5)\t1\n",
      "  (1, 1)\t2\n",
      "  (1, 4)\t1\n",
      "  (1, 0)\t1\n",
      "   angle  are  i  love  my  you\n",
      "0      0    0  1     1   0    1\n",
      "1      1    2  0     0   1    1\n"
     ]
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "source": [
    "# TF-IDF\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
    "tv = TfidfVectorizer(binary=False, decode_error='ignore', stop_words='english')\n",
    "vec = tv.fit_transform(['hello world', 'this is a panda.'])  # input: a list of sentences\n",
    "# Feature names in column order, read from the fitted vocabulary (term -> column index).\n",
    "# get_feature_names() is gone from recent scikit-learn; vocabulary_ is available in old\n",
    "# and new releases alike and yields the same list.\n",
    "feature_names = sorted(tv.vocabulary_, key=tv.vocabulary_.get)\n",
    "print(feature_names)\n",
    "print(vec)  # sparse matrix, printed as (row, column) weight\n",
    "df = pd.DataFrame(vec.toarray(), columns=feature_names)  # to DataFrame\n",
    "print(df.head())"
   ],
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "['hello', 'panda', 'world']\n",
      "  (0, 2)\t0.7071067811865476\n",
      "  (0, 0)\t0.7071067811865476\n",
      "  (1, 1)\t1.0\n",
      "      hello  panda     world\n",
      "0  0.707107    0.0  0.707107\n",
      "1  0.000000    1.0  0.000000\n"
     ]
    }
   ],
   "metadata": {}
  }
 ],
 "metadata": {
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3.6.8 64-bit ('base': conda)"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  },
  "interpreter": {
   "hash": "1fef96a86254b5b64b7294805a674893d583399788ea545149c5bfbe00efcc65"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}